#!/usr/bin/env python3
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013-2016 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#

import datetime
import hashlib
import os
import os.path
import re
import struct
import sys
import time
from binascii import unhexlify
from collections import namedtuple
from typing import Any, Dict

settings: Dict[str, Any] = {}


def hex_switchEndian(s):
    """Switches the endianness of a hex string (in pairs of hex chars)"""
    pairList = [s[i : i + 2].encode() for i in range(0, len(s), 2)]
    return b"".join(pairList[::-1]).decode()


def uint32(x):
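    # Mask down to an unsigned 32-bit value.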
    return x & 0xFFFFFFFF


def bytereverse(x):
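    # Swap the byte order of a 32-bit word, e.g. 0x12345678 -> 0x78563412.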
    return uint32(
        (
            ((x) << 24)
            | (((x) << 8) & 0x00FF0000)
            | (((x) >> 8) & 0x0000FF00)
            | ((x) >> 24)
        )
    )


def bufreverse(in_buf):
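    # Reverse the bytes within each 32-bit word of the buffer.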
    out_words = []
    for i in range(0, len(in_buf), 4):
        word = struct.unpack("@I", in_buf[i : i + 4])[0]
        out_words.append(struct.pack("@I", bytereverse(word)))
    return b"".join(out_words)


def wordreverse(in_buf):
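    # Reverse the order of the buffer's 32-bit words (bytes within each word
    # keep their order).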
    out_words = []
    for i in range(0, len(in_buf), 4):
        out_words.append(in_buf[i : i + 4])
    out_words.reverse()
    return b"".join(out_words)


def calc_hdr_hash(blk_hdr):
    """Double SHA-256 of the 80-byte block header."""
    return hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()


def calc_hash_str(blk_hdr):
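    # bufreverse followed by wordreverse on the 32-byte digest amounts to a
    # full byte reversal, giving the conventional big-endian display order.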
    blockhash = calc_hdr_hash(blk_hdr)
    blockhash = bufreverse(blockhash)
    blockhash = wordreverse(blockhash)
    hash_str = blockhash.hex()
    return hash_str


def get_blk_dt(blk_hdr):
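    # nTime sits at byte 68 of the header: 4 (version) + 32 (prev hash) + 32 (merkle root).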
    members = struct.unpack("<I", blk_hdr[68 : 68 + 4])
    nTime = members[0]
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)


# When getting the list of block hashes, undo any byte reversals.


def get_block_hashes(settings):
    blkindex = []
    f = open(settings["hashlist"], "r", encoding="utf8")
    for line in f:
        line = line.rstrip()
        if settings["rev_hash_bytes"] == "true":
            line = hex_switchEndian(line)
        blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex


# The block map shouldn't give or receive byte-reversed hashes.


def mkblockmap(blkindex):
    blkmap = {}
    for height, blockhash in enumerate(blkindex):
        blkmap[blockhash] = height
    return blkmap


# Block header and extent on disk
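# fn: input file number; offset: byte offset of the block payload (just past
# the 80-byte header); inhdr: 8-byte record header; blkhdr: 80-byte block
# header; size: payload length, excluding the block header.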
BlockExtent = namedtuple("BlockExtent", ["fn", "offset", "inhdr", "blkhdr", "size"])


class BlockDataCopier:
    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        self.inFn = 0
        self.inF = None
        self.outFn = 0
        self.outsz = 0
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0
        self.blkCountOut = 0

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000  # seed ~10 years before 2014-08-24
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings["max_out_sz"]
        if "output" in settings:
            self.fileOutput = False
        if settings["file_timestamp"] != 0:
            self.setFileTime = True
        if settings["split_timestamp"] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def writeBlock(self, inhdr, blk_hdr, rawblock):
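        """Write one block record (inhdr + blkhdr + payload), rotating and
        date-splitting output files as configured."""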
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
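            # self.hash_str is the hash of the block most recently scanned in
            # run(); when flushing cached out-of-order blocks it may not be
            # the block being written here.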
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings["output_file"]
            else:
                self.outFname = os.path.join(
                    self.settings["output"], f"blk{self.outFn:05d}.dat"
                )
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print(
                f"{self.blkCountIn} blocks scanned, {self.blkCountOut} blocks written "
                f"(of {len(self.blkindex)}, "
                f"{100.0 * self.blkCountOut / len(self.blkindex):.1f}% complete)"
            )

    def inFileName(self, fn):
        return os.path.join(self.settings["input"], f"blk{fn:05d}.dat")

    def fetchBlock(self, extent):
        """Fetch block contents from disk given extents"""
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        """Find the next block to be written in the input, and copy it to the output."""
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the
            # cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
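        """Scan the input blk*.dat files and emit blocks in height order."""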
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            # Each record: 4-byte network magic + 4-byte length (little-endian,
            # covering the 80-byte header plus the block payload).
            inhdr = self.inF.read(8)
            # An empty read or a zero byte marks the zero-padded tail of a blk file.
            if not inhdr or (inhdr[0] == 0):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if inMagic != self.settings["netmagic"]:
                print("Invalid magic: " + inMagic.hex())
                return
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug
                # output.
                if self.settings["debug_output"] == "true":
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings["out_of_order_cache_sz"]:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print(f"Done ({self.blkCountOut} blocks written)")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

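    # The config file holds key=value lines; '#' starts a comment. A minimal
    # example, relying on the defaults applied below:
    #
    #   netmagic=f9beb4d9
    #   input=/path/to/blocks
    #   hashlist=hashlist.txt
    #   output_file=bootstrap.dat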
    with open(sys.argv[1], encoding="utf8") as f:
        for line in f:
            # skip comment lines
            m = re.search(r"^\s*#", line)
            if m:
                continue

            # parse key=value lines
            m = re.search(r"^(\w+)\s*=\s*(\S.*)$", line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if "rev_hash_bytes" not in settings:
        settings["rev_hash_bytes"] = "false"
    settings["rev_hash_bytes"] = settings["rev_hash_bytes"].lower()

    if "netmagic" not in settings:
        settings["netmagic"] = "f9beb4d9"
    if "genesis" not in settings:
        settings["genesis"] = (
            "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f"
        )
    if "input" not in settings:
        settings["input"] = "input"
    if "hashlist" not in settings:
        settings["hashlist"] = "hashlist.txt"
    if "file_timestamp" not in settings:
        settings["file_timestamp"] = 0
    if "split_timestamp" not in settings:
        settings["split_timestamp"] = 0
    if "max_out_sz" not in settings:
        settings["max_out_sz"] = 1000 * 1000 * 1000
    if "out_of_order_cache_sz" not in settings:
        settings["out_of_order_cache_sz"] = 100 * 1000 * 1000
    if "debug_output" not in settings:
        settings["debug_output"] = "false"

    settings["max_out_sz"] = int(settings["max_out_sz"])
    settings["split_timestamp"] = int(settings["split_timestamp"])
    settings["file_timestamp"] = int(settings["file_timestamp"])
    settings["netmagic"] = unhexlify(settings["netmagic"].encode("utf-8"))
    settings["out_of_order_cache_sz"] = int(settings["out_of_order_cache_sz"])
    settings["debug_output"] = settings["debug_output"].lower()

    if "output_file" not in settings and "output" not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings["genesis"] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()
